#!/usr/bin/env python
# encoding=utf8

## Database: status error codes
#  0 : not parsed
#  1 : parsed correctly
#  2 : page not found
#  3 : attempted but something went wrong
#  4 : username didn't exist when the timeline was checked
#  5 : don't appear to be friends when comparing friend lists


# Mail settings used by the bot to report on its progress.
##
# me  == sender's email address (main() passes it to sendMail(), which also
#        uses it as the SMTP login name)
# you == recipient's email address
# pwd == password used by sendMail() for the SMTP login
# NOTE(review): credentials are stored in clear text in this file; consider
# loading them from the environment or an untracked config file instead.
me = "" # REDACTED
you = "" # REDACTED
pwd = "" # REDACTED
# Every how many pages do you want to receive an email?
# NOTE(review): neither mail_msg_threshold nor mail_spec is referenced by the
# code visible in this file -- confirm whether they are still needed.
mail_msg_threshold = 100
# All of the settings above bundled in a single dictionary.
mail_spec = {'me' : me, 'you' : you, 'pwd' : pwd, 'msg_threshold' : mail_msg_threshold}
##

import os
import sqlite3
import re
from random import randint
from time import sleep
import datetime
from datetime import datetime, time, date
from random import randrange
import sys
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException

import html2text
import smtplib
from email.mime.multipart import MIMEMultipart
from email.mime.text import MIMEText

# Functions


def updateDbWithTimelineInfo(db, db_timeline):
    """Flag friendships that involve users whose timeline page was not found.

    Reads every ``fb_id`` with ``status = 2`` from the ``fbUsersTimeline``
    table of the ``db_timeline`` SQLite file, then sets ``status = 4`` on
    every row of the ``sampleFbFriendship`` table of the ``db`` SQLite file
    whose ``[from]`` or ``[to]`` column equals one of those ids.

    Database errors are printed and swallowed: the function always returns
    None and never raises on a missing file or table.

    NOTE(review): queryDb() and insertDb() work on the table
    ``fb_friendship_edgelist`` while this function updates
    ``sampleFbFriendship`` -- confirm that this is the intended table.
    """
    print("Updating database...")

    timeline_conn = None
    try:
        timeline_conn = sqlite3.connect(db_timeline)
        rows = timeline_conn.execute(
            "SELECT fb_id FROM fbUsersTimeline WHERE status = 2").fetchall()
    except Exception as e:
        print(str(e))
        return
    finally:
        # Release the file handle whether or not the query succeeded.
        if timeline_conn is not None:
            timeline_conn.close()

    # Keep the ids exactly as SQLite returned them (text).  Encoding them to
    # UTF-8 bytes before binding breaks the comparison with TEXT columns
    # (bytes are bound as BLOB on Python 3, and non-ASCII 8-bit strings are
    # rejected by sqlite3 on Python 2).  NULL ids cannot match any row.
    nopage_list = [fb_id for (fb_id,) in rows if fb_id is not None]

    conn = None
    try:
        conn = sqlite3.connect(db)
        conn.executemany(
            "UPDATE sampleFbFriendship SET status = 4 WHERE [from] = ? OR [to] = ?;",
            [(nopage, nopage) for nopage in nopage_list])
        conn.commit()
        print("Done")
    except Exception as e:
        print(str(e))
    finally:
        if conn is not None:
            conn.close()

    return


def queryDb(db):
    """Return the friendship pairs that have not been parsed yet.

    Selects ``[from]`` and ``[to]`` from every row of the
    ``fb_friendship_edgelist`` table of the ``db`` SQLite file whose
    ``status`` is 0.

    Returns a list of dictionaries ``{'from_': ..., 'to_': ...}`` whose
    values are the UTF-8 encoded column values.  If the database cannot be
    opened or queried the error is printed and an empty list is returned.
    """
    conn = None
    try:
        conn = sqlite3.connect(db)
        rows = conn.execute(
            "SELECT [from], [to] FROM fb_friendship_edgelist WHERE status = 0").fetchall()
    except Exception as e:
        print(str(e))
        # Nothing can be read: report "no work" instead of falling through
        # to a cursor that may never have been created.
        return []
    finally:
        if conn is not None:
            conn.close()

    return [{'from_': from_.encode("utf-8"), 'to_': to_.encode("utf-8")}
            for (from_, to_) in rows]

def insertDb(db, obj):
    """Store the outcome of one scraped friendship pair.

    Updates the row of ``fb_friendship_edgelist`` (in the ``db`` SQLite file)
    identified by ``obj['from_']`` and ``obj['to_']`` with the values of
    ``obj['friends']``, ``obj['since']``, ``obj['grew_up']``,
    ``obj['living']`` and ``obj['status']``, plus the current local time
    formatted as ``YYYY-MM-DD HH:MM:SS`` in the ``timestamp`` column.

    Any error (database problem or missing key in ``obj``) is printed and
    swallowed; the function always returns None.
    """
    # Prepare timestamp
    now = datetime.now().strftime("%Y-%m-%d %H:%M:%S")

    conn = None
    try:
        conn = sqlite3.connect(db)
        conn.execute("UPDATE fb_friendship_edgelist SET friends = ?, since = ?, grew_up = ?, living = ?, status = ?, timestamp=? WHERE [from] = ? AND [to] = ?;", (obj['friends'], obj['since'], obj['grew_up'], obj['living'], obj['status'], now, obj['from_'], obj['to_']))
        conn.commit()
    except Exception as e:
        print(str(e))
    finally:
        # This function is called once per scraped page; close the handle
        # every time so connections do not pile up during a long run.
        if conn is not None:
            conn.close()

def parseTimelineElement(unicode, from_, to_):
    """Turn the text of a friendship summary box into a database record.

    ``unicode`` is the text to parse; ``from_`` and ``to_`` identify the pair
    the text belongs to.  The pair is considered friends when matchDate()
    finds a "friends since" date.  The record is built by prepareDict() with
    status 1.
    """
    since = matchDate(unicode)
    places = matchPlace(unicode)

    return prepareDict(since is not None,
                       since,
                       places['grew_up'],
                       places['living'],
                       1,
                       from_,
                       to_)
        

def prepareDict(friends, since, grew_up, living, status, from_, to_):
    """Bundle the given values in a dictionary keyed by the parameter names.

    The returned dictionary has the keys 'friends', 'since', 'grew_up',
    'living', 'status', 'from_' and 'to_'.
    """
    keys = ('friends', 'since', 'grew_up', 'living', 'status', 'from_', 'to_')
    values = (friends, since, grew_up, living, status, from_, to_)

    return dict(zip(keys, values))


def getFirst(iterable, default=None):
    """Return the first item of ``iterable``.

    ``default`` is returned when ``iterable`` is falsy (None, empty list,
    empty string, ...) or when iterating over it yields nothing.
    """
    if not iterable:
        return default
    return next(iter(iterable), default)


def matchDate(unicode):
    """Return the text following 'Facebook friends since ' in ``unicode``.

    The match stops at the first newline or at the end of the text.  None is
    returned when the phrase does not occur.
    """
    friends_since = re.findall('Facebook friends since (.*?)(?:$|\n)', unicode, re.DOTALL)

    return getFirst(friends_since)


def _firstGroup(pattern, text):
    """Return group 1 of the first match of ``pattern`` in ``text``, or None."""
    match = re.search(pattern, text, re.DOTALL)
    return match.group(1) if match else None


def matchPlace(unicode):
    """Extract the "grew up" and "living" places from a summary text.

    Recognised phrasings (each place ends at a newline or at the end of the
    text):

      'Grew up in Novara, Italy and living in Bergamo, Italy'
      'Grew up and living in Novara, Italy'
      'Living in Novara, Italy'

    Returns a dictionary with the keys 'grew_up' and 'living'; a value is
    None when the corresponding place is not mentioned.
    """
    places = {'grew_up' : None, 'living' : None}

    if 'Grew up in' in unicode:
        # Stop only at the full " and living in " connector.  Stopping at a
        # bare "and" truncates places such as "Portland" or "Finland" and
        # leaves a trailing space in front of the connector.
        places['grew_up'] = _firstGroup(r'Grew up in (.*?)(?:$|\n| and living in )', unicode)
        places['living'] = _firstGroup(r'and living in (.*?)(?:$|\n)', unicode)

    if 'Grew up and living in' in unicode:
        places['grew_up'] = _firstGroup(r'Grew up and living in (.*?)(?:$|\n)', unicode)
        places['living'] = places['grew_up']

    if 'Living in' in unicode:
        places['living'] = _firstGroup(r'Living in (.*?)(?:$|\n)', unicode)

    return places


def _stripFbUrl(value, url_regexp, profile_regexp):
    """Reduce a www.facebook.com URL to the user name or numeric id it holds.

    Values that do not contain ``url_regexp`` are returned untouched.
    """
    if re.search(url_regexp, value) is None:
        return value

    # Keep whatever follows the host name ...
    value = re.search('(?<=www.facebook.com/)(.*)', value).group(1)

    # ... and, for profile.php links, only what follows "id=".
    if re.search(profile_regexp, value) is not None:
        value = re.search('(?<=profile.php\\?id\\=)(.*)', value).group(1)

    return value


def composeUrl(from_, to_):
    """Build the URL of the friendship page between two users.

    ``from_`` and ``to_`` may be bare user names/ids or full
    www.facebook.com profile URLs; URLs are reduced to the name or id first.
    The result has the form 'https://www.facebook.com/<from>?and=<to>'.
    """
    url_regexp = re.escape('://www.facebook.com/')
    profile_regexp = re.escape('profile.php?id=')

    first = _stripFbUrl(from_, url_regexp, profile_regexp)
    second = _stripFbUrl(to_, url_regexp, profile_regexp)

    return 'https://www.facebook.com/' + first + '?and=' + second

def composeMailMessage(error):
    """Return the HTML body of the notification mail.

    ``error`` tells whether the bot is quitting because something went wrong
    (true) or because it reached the end of its work (false).  A string is
    returned in both cases, so the result can always be handed to a mail
    sender.
    """
    if error:
        return "<p>Hi!<br>Here's you bot scraping friendship pages.<br>Something went wrong and I quit.<br>Have a nice day."

    # Without a message for the successful case the final mail has no body
    # and cannot be composed.
    return "<p>Hi!<br>Here's your bot scraping friendship pages.<br>I went through all the pending friendship pages.<br>Have a nice day."

def sendMail(me, you, pwd, html):
    """Send ``html`` as a multipart (plain text + HTML) mail through Gmail.

    ``me`` is the sender address and SMTP login name, ``pwd`` the SMTP
    password, ``you`` the recipient address and ``html`` an HTML fragment
    that gets wrapped in a minimal HTML document.

    Sending is best-effort: any error is printed and swallowed, and the
    function always returns None.
    """
    try:

        # Create message container - the correct MIME type is multipart/alternative.
        msg = MIMEMultipart('alternative')
        msg['Subject'] = "Message from your bot scraping friendship pages"
        msg['From'] = me
        msg['To'] = you

        # Create the body of the message (a plain-text and an HTML version).
        text = html2text.html2text(html)
        html = """\
        <html>
        <head></head>
        <body>
        %s
        </body>
        </html>
        """ % (html)

        # Record the MIME types of both parts - text/plain and text/html.
        part1 = MIMEText(text, 'plain')
        part2 = MIMEText(html, 'html')

        # Attach parts into message container.
        # According to RFC 2046, the last part of a multipart message, in this case
        # the HTML message, is best and preferred.
        msg.attach(part1)
        msg.attach(part2)

        # Send the message via Gmail's SMTP server (STARTTLS on port 587).
        mailServer = smtplib.SMTP("smtp.gmail.com", 587)
        try:
            mailServer.ehlo()
            mailServer.starttls()
            mailServer.ehlo()
            mailServer.login(me, pwd)
            # sendmail function takes 3 arguments: sender's address, recipient's address
            # and message to send - here it is sent as one string.
            mailServer.sendmail(me, you, msg.as_string())
        finally:
            # Close the socket even when the TLS handshake, the login or the
            # delivery fails.
            mailServer.close()

    except Exception as e:
        print(str(e))

    return

def main():
    """Scrape the friendship page of every pair that is still to be parsed.

    Steps:
      1. updateDbWithTimelineInfo() flags the pairs involving users whose
         timeline page is known to be missing.
      2. queryDb() returns the pairs still to be parsed.
      3. A Chrome session is started and the Facebook login form is filled in.
      4. Each friendship page is loaded; the outcome is stored with insertDb()
         (status 1 when the summary box was parsed, status 2 when the page
         was not found or Facebook answered with its home page).
      5. When a page is neither of the above, the loop stops, an error mail
         is sent and the process exits.  Otherwise a final mail is sent once
         all pairs have been visited.

    The browser is always closed, including when an unexpected exception
    interrupts the run.
    """
    # Connect to database and parse all edges
    db = 'friendship.sqlite'
    db_timeline = 'fb_users_timeline.sqlite'
    chromedriver = "chromedriver"
    os.environ["webdriver.chrome.driver"] = chromedriver

    updateDbWithTimelineInfo(db, db_timeline)

    query_list = queryDb(db)

    print("Yet to parse " + str(len(query_list)) + " relationships")

    # Page titles that identify the two "no such page" cases.
    not_found_regex = re.compile(r'Page not found')
    fb_homepage_regex = re.compile(r'^(\([0-9]*\) )?Facebook$')

    # Set when an unrecognised page forces the run to stop early.
    failed = False

    driver = webdriver.Chrome(chromedriver)
    try:
        driver.implicitly_wait(60)

        driver.get('http://www.facebook.com/')

        try:
            driver.find_element_by_xpath(".//*[@id='email']").send_keys("") # REDACTED
            driver.find_element_by_xpath(".//*[@id='pass']").send_keys("") # REDACTED
            driver.find_element_by_xpath("//input[@type='submit'][@value='Log In' or @value='Login']").click()
        except Exception as e:
            # Logging in is best-effort (the form may be missing), but say
            # why it failed and let Ctrl-C through.
            print(str(e))

        sleep(randint(10,40))

        ## OUTER LOOP
        for pair in query_list:
            url = composeUrl(pair['from_'], pair['to_'])

            ## # Sleep between 22 and 8 adding random noise
            ## min1 = randrange(10,60)
            ## min2 = randrange(10,60)
            ## now = datetime.now()
            ## print "Current time: " + str(now)
            ## now_time = now.time()
            ## if now_time >= time(13,min1) and now_time <= time(21,min2):
            ##     print "Sleeping until " + str(time(9,min2))
            ##     sleep_time = datetime.combine(date.today(), time(21,min2)) - datetime.combine(date.today(), now_time)
            ##     sleep(sleep_time.seconds)

            print(url)

            driver.get(url)

            try:
                WebDriverWait(driver, 60).until(
                    EC.presence_of_element_located((By.ID, "pagelet_timeline_main_column")))

            except Exception:
                # The timeline column never showed up: use the page title to
                # find out what was returned instead.
                soup = BeautifulSoup(driver.page_source)
                try:
                    title = soup.head.title.text
                except AttributeError:
                    # No <head>/<title> to inspect; leave the pair untouched
                    # so that it is retried on the next run.
                    sleep(randint(10,40))
                    continue

                page_not_found = not_found_regex.search(title) is not None
                fb_homepage = fb_homepage_regex.search(title) is not None

                if not (page_not_found or fb_homepage):
                    # Unknown page: stop here and report the problem below.
                    failed = True
                    break

                if fb_homepage:
                    print("Page not returned (not existing or privacy)")
                record = prepareDict(None, None, None, None, 2, pair['from_'], pair['to_'])
                print(record)
                insertDb(db, record)
                sleep(randint(10,40))
                continue

            else:
                try:
                    # Get element containing common details
                    timeline_summary = driver.find_element_by_class_name("fbTimelineSummarySection")
                    # Parse content of box
                    record = parseTimelineElement(timeline_summary.text, pair['from_'], pair['to_'])
                    # Insert into the database
                    print(record)
                    insertDb(db, record)
                    sleep(randint(10,40))
                except NoSuchElementException as e:
                    print(str(e))
                    continue
    finally:
        # Close the browser exactly once, whatever happened above.
        driver.quit()

    mail_message = composeMailMessage(error=failed)
    sendMail(me, you, pwd, mail_message)

    if failed:
        sys.exit()


# Start the scraper only when this file is executed as a script, so that
# importing the module (e.g. to reuse its parsing helpers) has no side effects.
if __name__ == "__main__":
    main()
